/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file src/runtime/contrib/clml/clml_runtime.cc
 * \brief A simple JSON runtime for CLML.
 */
#include "clml_runtime.h"

#include <tvm/ffi/reflection/registry.h>

#include <exception>
#include <unordered_map>

#ifdef TVM_GRAPH_EXECUTOR_CLML
#include "clml_memory_planner.h"
#include "clml_utils.h"
#endif

#include <tvm/runtime/profiling.h>

namespace tvm {
namespace runtime {
namespace contrib {

using namespace tvm::runtime::json;
using JSONGraphNode = tvm::runtime::json::JSONGraphNode;

#ifdef TVM_GRAPH_EXECUTOR_CLML
/*!
 * \brief Fetch the CLML thread entry for the calling context.
 * \return The pointer returned by CLMLThreadEntry::ThreadLocal().
 */
CLMLThreadEntry* CLMLWorkspace::GetThreadEntry() {
  CLMLThreadEntry* entry = CLMLThreadEntry::ThreadLocal();
  return entry;
}

/*!
 * \brief Access the shared CLML workspace.
 *
 * The workspace is heap-allocated the first time this function runs and is
 * never deleted by this function.
 *
 * \return Pointer to the shared CLMLWorkspace instance.
 */
CLMLWorkspace* CLMLWorkspace::Global() {
  static CLMLWorkspace* const singleton = new CLMLWorkspace();
  return singleton;
}

/*!
 * \brief Build the CLML workspace on top of the global OpenCL workspace.
 *
 * Steps performed, in order:
 *  - initialise the OpenCL workspace and resolve the device / platform ids;
 *  - read CL_DEVICE_EXTENSIONS and require "cl_qcom_ml_ops";
 *  - detect recordable-queue and on-chip-memory support (both are forced off
 *    when the CLML_DISABLE_RECORDABLE_QUEUE environment variable is set);
 *  - query the CLML interface versions and fetch the interface handle;
 *  - read the tuning configuration from CLML_IS_TUNING_RUN / CLML_TUNING_CACHE.
 */
CLMLWorkspace::CLMLWorkspace() {
  cl_int result = 0;
  workspace = cl::OpenCLWorkspace::Global();
  workspace->Init();
  // NOTE(review): the entry cached here is the one returned to whichever thread
  // constructs the workspace - confirm this is intended if several threads run CLML.
  tentry = workspace->GetThreadEntry();

  device_id = workspace->GetCLDeviceID(tentry->device.device_id);
  platform_id = workspace->device_info[device_id].platform_id;

  // Fetch the extension string: first the required size, then the contents.
  size_t reqd_size = 0;
  result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size);
  ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
  std::vector<char> extn_buf(reqd_size);
  result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr);
  ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
  std::string extensions(extn_buf.data());
  LOG_CLML << "OpenCL Extensions:" << extensions;

  if (extensions.find("cl_qcom_ml_ops") == std::string::npos) {
    LOG(FATAL) << "CLML Runtime Init: Qualcomm extn not present.\n";
    return;
  }
  if (getenv("CLML_DISABLE_RECORDABLE_QUEUE")) {
    is_recordable_queue = 0;
    is_on_chip_memory = 0;
  } else {
    is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos);
    is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos);
    LOG_CLML << "Recordable Queues Support :" << is_recordable_queue;
    LOG_CLML << "On chip Memory Support :" << is_on_chip_memory;
  }

  if (is_on_chip_memory) {
    result = clGetDeviceInfo(device_id, CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM,
                             sizeof(onchip_mem_size), &onchip_mem_size, nullptr);
    ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo(CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM):"
                                 << result;
    LOG_CLML << "On chip memory size:" << onchip_mem_size;
  }

  // Query and Get CLML Interface
  static const cl_uint MAX_VERSIONS = 256;
  cl_int majorVersions[MAX_VERSIONS];
  cl_int minorVersions[MAX_VERSIONS];
  cl_uint numVersions = 0;
  result = clQueryMLInterfaceVersionsQCOM(nullptr, nullptr, 0, &numVersions);
  ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;
  ICHECK(numVersions > 0u);
  ICHECK(numVersions <= MAX_VERSIONS);

  result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, nullptr);
  ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;

  // The last reported version is taken as the target version.
  target_major = majorVersions[numVersions - 1];
  target_minor = minorVersions[numVersions - 1];

  LOG(WARNING) << "CLML Target Version:" << target_major << "." << target_minor;

  // The target is newer than the headers this runtime was built against:
  // fall back to the compiled major version with minor 0.
  if (target_major > CL_QCOM_ML_OPS_H_MAJOR_VERSION) {
    LOG(WARNING) << "Runtime is compiled with " << CL_QCOM_ML_OPS_H_MAJOR_VERSION
                 << " where as target supports " << target_major
                 << "\nTrying to use API interface version:" << CL_QCOM_ML_OPS_H_MAJOR_VERSION
                 << "\nSome functionality may not work as expected ...";
    target_major = CL_QCOM_ML_OPS_H_MAJOR_VERSION;
    target_minor = 0;
  }

  clGetMLInterfaceQCOM(&h_ClmlIntf, target_major, target_minor);

  ICHECK(nullptr != h_ClmlIntf) << "Couldn't get API interface, target is not supported."
                                << " Compiled version: " << CL_QCOM_ML_OPS_H_MAJOR_VERSION << "."
                                << CL_QCOM_ML_OPS_H_MINOR_VERSION
                                << " Target Version:" << target_major << "." << target_minor;

  // CLML_IS_TUNING_RUN is parsed as an integer. std::stoi throws on a value
  // that is not a number (or is out of range); treat that as "not a tuning
  // run" instead of letting the exception escape workspace construction.
  is_tuning_run = 0;
  if (const char* tune_flag = getenv("CLML_IS_TUNING_RUN")) {
    try {
      is_tuning_run = std::stoi(tune_flag);
    } catch (const std::exception&) {
      LOG(WARNING) << "Ignoring invalid CLML_IS_TUNING_RUN value: " << tune_flag;
    }
  }

  // Without a tuning cache file path the tuning run is disabled.
  if (!(tuning_file = getenv("CLML_TUNING_CACHE"))) this->is_tuning_run = 0;
}

/*! \brief Alias for the dmlc thread-local store specialised on CLMLThreadEntry. */
using CLMLThreadStore = dmlc::ThreadLocalStore<CLMLThreadEntry>;

/*!
 * \brief Obtain the CLMLThreadEntry held by CLMLThreadStore.
 * \return The pointer returned by CLMLThreadStore::Get().
 */
CLMLThreadEntry* CLMLThreadEntry::ThreadLocal() {
  CLMLThreadEntry* entry = CLMLThreadStore::Get();
  return entry;
}
#endif

class CLMLRuntime : public JSONRuntimeBase {
 public:
  /*!
   * \brief Construct the CLML runtime module.
   *
   * All three arguments are forwarded to JSONRuntimeBase; the symbol name is
   * additionally kept in clml_symbol.
   *
   * \param symbol_name The name of the function.
   * \param graph_json Serialized JSON representation of a sub-graph.
   * \param const_names The names of each constant in the sub-graph.
   */
  explicit CLMLRuntime(const std::string& symbol_name, const std::string& graph_json,
                       const ffi::Array<ffi::String>& const_names)
      : JSONRuntimeBase(symbol_name, graph_json, const_names),
        clml_symbol(symbol_name) {
  }

  /*!
   * \brief Release the CLML / OpenCL objects referenced by this layer.
   *
   * Order of release: tuning cache, tensors in the storage map together with
   * their memory, ML ops, input and output placeholder tensors, the tensor
   * memory descriptor set and finally the recordable command queue.
   */
  ~CLMLRuntime() {
#ifdef TVM_GRAPH_EXECUTOR_CLML
    cl_int result = 0;
    if (this->layer_.tuning_cache) {
      CLML_CALL(clReleaseMLTuningCacheQCOM, this->layer_.tuning_cache);
    }

    // Tensors tracked by the storage map and the memory behind each of them.
    for (auto& entry : this->layer_.storage_map) {
      const auto& desc = entry.second.tensor_desc;
      CLML_CALL(clReleaseMLTensorQCOM, desc->tensor)
      // Memory listed in ddr_storage_ref_map goes through ReleaseDDRMemory;
      // anything else is released directly as a cl_mem object.
      const bool tracked_ddr = this->layer_.ddr_storage_ref_map.find(desc->memory) !=
                               this->layer_.ddr_storage_ref_map.end();
      if (tracked_ddr) {
        ReleaseDDRMemory(desc->memory);
      } else {
        result = clReleaseMemObject(desc->memory);
        ICHECK(result == CL_SUCCESS) << "clReleaseMemObject:" << result;
      }
    }

    // ML operators.
    for (auto& op : this->layer_.function) {
      CLML_CALL(clReleaseMLOpQCOM, op)
    }

    // Placeholder tensors; only the tensor handle is released here.
    for (auto& in_entry : this->layer_.in_placeholder) {
      CLML_CALL(clReleaseMLTensorQCOM, in_entry.second->tensor)
    }
    for (auto& out_desc : this->layer_.out_placeholder) {
      CLML_CALL(clReleaseMLTensorQCOM, out_desc->tensor)
    }
    CLML_CALL(clReleaseMLTensorMemoryDescriptorSetQCOM, layer_.descriptorSet)

    if (this->layer_.recordable_queue) {
      clReleaseCommandQueue(this->layer_.recordable_queue);
    }
#endif
  }

  /*!
   * \brief Report the type key of this module.
   *
   * \return The string "clml".
   */
  const char* kind() const override {
    return "clml";
  }

  /*!
   * \brief Initialize the runtime and build the CLML layer from the JSON graph.
   *
   * Checks that the number of supplied constants equals the number of constant
   * slots, registers them, runs InitCLML() when TVM_GRAPH_EXECUTOR_CLML is
   * defined, and finally calls BuildEngine().
   *
   * \param consts The constant params from compiled model.
   */
  void Init(const ffi::Array<Tensor>& consts) override {
    const auto num_given = consts.size();
    const auto num_required = const_idx_.size();
    ICHECK_EQ(num_given, num_required)
        << "The number of input constants must match the number of required.";
    SetupConstants(consts);

#ifdef TVM_GRAPH_EXECUTOR_CLML
    // CLML state must exist before the engine is built.
    InitCLML();
#endif

    BuildEngine();
  }

#ifdef TVM_GRAPH_EXECUTOR_CLML
  /*!
   * \brief Set up the per-module CLML state.
   *
   * Binds the global CLML workspace, creates the recordable queue and its
   * recording when the workspace reports support for them, and creates the
   * tuning cache. When this is not a tuning run and a tuning file is
   * configured, the file is scanned for the record whose symbol equals
   * clml_symbol and that record is loaded into the tuning cache.
   *
   * Records are read in this order: magic header (uint64), reserved (uint64),
   * symbol name (string), tuning blob (byte vector).
   */
  void InitCLML() {
    // Setup CLML Context
    cl_int result = 0;
    cws = CLMLWorkspace::Global();

    if (cws->is_recordable_queue) {
      this->layer_.recordable_queue =
          clCreateCommandQueue(CLML_CTX, cws->device_id, CL_QUEUE_RECORDABLE_QCOM, &result);
      ICHECK(result == CL_SUCCESS) << "clCreateCommandQueue - Recordable:" << result;

      this->layer_.recording = clNewRecordingQCOM(this->layer_.recordable_queue, &result);
      ICHECK(result == CL_SUCCESS) << "clNewRecordingQCOM:" << result;
    }

    // The cache object is always created; it stays empty unless a stored
    // record is loaded into it below.
    CLML_CALL(clCreateMLTuningCacheQCOM, &layer_.tuning_cache)
    if (!cws->is_tuning_run && cws->tuning_file) {
      std::vector<unsigned char> tune_buffer;
      std::string tune_blob;
      LoadBinaryFromFile(cws->tuning_file, &tune_blob);
      dmlc::MemoryStringStream mstrm(&tune_blob);
      dmlc::Stream* strm = &mstrm;

      uint64_t header = 0;
      uint64_t reserve = 0;
      std::string tune_symbol;
      while (strm->Read(&header)) {
        if (header != kTVMCLMLTuningCacheMagic) break;
        if (!strm->Read(&reserve)) break;
        if (!strm->Read(&tune_symbol)) break;
        if (tune_symbol == clml_symbol) {
          // A failed read may leave a partial blob behind; never hand that to CLML.
          if (!strm->Read(&tune_buffer)) {
            LOG(WARNING) << "Truncated tuning cache record for symbol :" << clml_symbol
                         << " in file " << cws->tuning_file;
            tune_buffer.clear();
          }
          break;
        }
        // Not our symbol: consume its blob so the stream is positioned at the next record.
        std::vector<unsigned char> skipped;
        if (!strm->Read(&skipped)) break;
      }

      if (tune_buffer.size()) {
        LOG(INFO) << "Loading tuning cache for symbol:" << clml_symbol
                  << " size:" << tune_buffer.size();
        CLML_CALL(clLoadMLTuningCacheQCOM, layer_.tuning_cache, tune_buffer.size(),
                  tune_buffer.data())
      } else {
        LOG(WARNING) << "Tuning cache not found for symbol :" << clml_symbol << " in file "
                     << cws->tuning_file;
      }
    }
  }

  /*!
   * \brief Run the layer op by op and dump every tensor in the storage map.
   *
   * For each storage map entry whose node is a "kernel", the next CLML op is
   * enqueued and the queue is drained before the tensor is copied to a CPU
   * Tensor. The tensors are serialized through "runtime.SaveParams" (when that
   * global function is registered) and hex encoded, two digits per byte.
   *
   * Fails with LOG(FATAL) when recordable queues are enabled.
   *
   * \return JSON object with key "graph" (the graph JSON) and, when
   *         serialization is available, key "tensors" (hex encoded bytes).
   */
  std::string DebugDump(void) override {
    if (cws->is_recordable_queue) {
      LOG(FATAL) << "Debugging over recordable queues is not supported yet. You may disable the "
                    "same by exporting CLML_DISABLE_RECORDABLE_QUEUE at runtime.";
    }
    cl_command_queue queue = CLML_QUEUE;
    ffi::Map<ffi::String, Tensor> dump_tensors;
    std::ostringstream os;
    dmlc::JSONWriter writer(&os);
    writer.BeginObject();

    writer.WriteObjectKeyValue("graph", graph_json_);

    // NOTE(review): ops are consumed in storage map iteration order; this
    // presumes that order matches the order in which ops were created - confirm.
    int op_index = 0;
    for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
      int nid = it->first;
      const auto& clml_desc = it->second.tensor_desc;
      const auto& node = it->second.node;

      if ("kernel" == node.GetOpType()) {
        CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[op_index],
                  this->layer_.descriptorSet, 0, nullptr, nullptr);
        OPENCL_CALL(clFinish(queue));
        op_index++;
      }

      // Dump tensor to CPU
      std::vector<int64_t> shape = node.GetOpShape()[0];
      DLDataType tvm_dtype = node.GetOpDataType()[0];
      Tensor narr = Tensor::Empty(ffi::Shape(shape), tvm_dtype, {kDLCPU, 0});
      CopyDataFromCLMLTensor(clml_desc, narr.operator->()->data);

      // Naming convention: graph outputs, const/input nodes, everything else.
      std::string node_name;
      bool is_out = false;
      for (size_t i = 0; i < outputs_.size(); ++i) {
        // Stop at the first match so a later, non-matching output cannot reset the flag.
        if (EntryID(outputs_[i]) == static_cast<uint32_t>(nid)) {
          is_out = true;
          break;
        }
      }
      if (is_out) {
        node_name = clml_symbol + "_layer_out_" + std::to_string(nid);
      } else if (("const" == node.GetOpType()) || ("input" == node.GetOpType())) {
        node_name = node.GetOpName();
      } else {
        node_name = node.GetOpName() + "____topo-index:" + std::to_string(nid);
      }
      dump_tensors.Set(node_name, narr);
    }

    const auto f = tvm::ffi::Function::GetGlobal("runtime.SaveParams");
    if (f.has_value()) {
      std::string dump_bytes = (*f)(dump_tensors).cast<ffi::String>();
      std::ostringstream oss;
      /*TODO(Siva) HEX encoding doubles the size, look for better encode that can cross the RPC. */
      for (size_t i = 0; i < dump_bytes.size(); ++i) {
        // Go through unsigned char: where plain char is signed, bytes >= 0x80
        // would otherwise sign-extend and print as eight hex digits.
        oss << std::setw(2) << std::setfill('0') << std::hex
            << static_cast<int>(static_cast<unsigned char>(dump_bytes[i]));
      }
      writer.WriteObjectKeyValue("tensors", oss.str());
    }

    writer.EndObject();
    return os.str();
  }

  /*!
   * \brief Execute the layer while reporting each step to a profiler.
   *
   * Three kinds of profiler calls are emitted, each annotated with an
   * "Argument Shapes" metric: "CopyIn" for every OpenCL-resident graph input,
   * one call per CLML op (named "<symbol>-<layer name>"), and "CopyOut" for
   * every OpenCL-resident graph output. Inputs and outputs that are not on an
   * OpenCL device are skipped.
   *
   * \param prof The profiler receiving StartCall / StopCall notifications.
   */
  void RunProfile(profiling::Profiler* prof) override {
    cl_command_queue queue = CLML_QUEUE;
    std::vector<cl_event>& evts = cws->workspace->GetEventQueue(cws->tentry->device);

    for (size_t i = 0; i < input_nodes_.size(); ++i) {
      auto nid = input_nodes_[i];
      uint32_t eid = EntryID(nid, 0);
      if (nodes_[nid].GetOpType() == "input") {
        // Assuming all inputs are from OpenCL
        if (kDLOpenCL == data_entry_[eid]->device.device_type) {
          layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
              ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
          // NOTE(review): when not profiling, the event written to cpy_evt is
          // never released here - confirm whether it should be.
          cl_event cpy_evt = nullptr;
          cl_event* evt = &cpy_evt;
          if (cws->workspace->IsProfiling(cws->tentry->device)) {
            evts.resize(evts.size() + 1);
            evt = &(evts.back());
          }
          std::unordered_map<std::string, ffi::Any> metrics;
          std::string shape_str;
          std::vector<int64_t> shape = nodes_[nid].GetOpShape()[0];
          DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
          shape_str.append(profiling::ShapeString(shape, tvm_dtype));
          metrics["Argument Shapes"] = ffi::String(shape_str);

          prof->StartCall("CopyIn", cws->tentry->device, metrics);
          CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor,
                    layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor,
                    layer_.inputs[nid]->memory, 0, nullptr, evt);
          prof->StopCall();
        }
      }
    }

    for (size_t i = 0; i < this->layer_.function.size(); ++i) {
      std::unordered_map<std::string, ffi::Any> metrics;
      const auto& node = this->layer_.op_node_map[this->layer_.function[i]].second;
      std::string shape_str;
      for (uint32_t j = 0; j < node.GetInputs().size(); ++j) {
        const JSONGraphNode& in_node = nodes_[node.GetInputs()[j].id_];
        std::vector<int64_t> shape = in_node.GetOpShape()[0];
        DLDataType tvm_dtype = in_node.GetOpDataType()[0];
        shape_str.append(profiling::ShapeString(shape, tvm_dtype));
        shape_str.append(", ");
      }
      // Assuming one output per operation
      std::vector<int64_t> shape = node.GetOpShape()[0];
      DLDataType tvm_dtype = node.GetOpDataType()[0];
      shape_str.append(profiling::ShapeString(shape, tvm_dtype));
      metrics["Argument Shapes"] = ffi::String(shape_str);

      // Launch call
      prof->StartCall(clml_symbol + "-" + this->layer_.layer_names[i], cws->tentry->device,
                      metrics);
      queue = CLML_QUEUE;
      evts.resize(evts.size() + 1);
      cl_event* evt = &(evts.back());
      CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet, 0,
                nullptr, evt);
      prof->StopCall();
    }

    for (size_t i = 0; i < outputs_.size(); ++i) {
      uint32_t eid = EntryID(outputs_[i]);

      // Assuming all outputs are to OpenCL
      if (kDLOpenCL == data_entry_[eid]->device.device_type) {
        layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
            ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
        cl_event cpy_evt = nullptr;
        cl_event* evt = &cpy_evt;
        if (cws->workspace->IsProfiling(cws->tentry->device)) {
          evts.resize(evts.size() + 1);
          evt = &(evts.back());
        }

        std::unordered_map<std::string, ffi::Any> metrics;
        std::string shape_str;
        // nodes_ is indexed by node id, not by data entry id; the two differ
        // as soon as any earlier node owns more than one entry.
        // Assuming one output per operation
        const auto& out_node = nodes_[outputs_[i].id_];
        std::vector<int64_t> shape = out_node.GetOpShape()[0];
        DLDataType tvm_dtype = out_node.GetOpDataType()[0];
        shape_str.append(profiling::ShapeString(shape, tvm_dtype));
        metrics["Argument Shapes"] = ffi::String(shape_str);

        prof->StartCall("CopyOut", cws->tentry->device, metrics);
        CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor,
                  layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor,
                  layer_.out_placeholder[i]->memory, 0, nullptr, evt);
        prof->StopCall();
      }
    }

    return;
  }

  /*!
   * \brief Unpack inputs and outputs and run inference on a given layer.
   *
   * \param args Access inputs and outputs.
   * \param function The layer to execute inference on.
   * \return Status of inference.
   */
  void Run() override {
    LOG_CLML << "Run Start";
    cl_command_queue queue = CLML_QUEUE;
    std::vector<cl_event>& evts = cws->workspace->GetEventQueue(cws->tentry->device);
    for (size_t i = 0; i < input_nodes_.size(); ++i) {
      auto nid = input_nodes_[i];
      uint32_t eid = EntryID(nid, 0);
      if (nodes_[nid].GetOpType() == "input") {
        void* data = data_entry_[eid]->data;
        size_t isize = 1;
        for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
          isize *= data_entry_[eid]->shape[j];
        }
        if (kDLCPU == data_entry_[eid]->device.device_type) {
          CopyDataToCLMLTensor(layer_.inputs[nid], data);
        } else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
          layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
              ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
          cl_event cpy_evt = nullptr;
          cl_event* evt = &cpy_evt;
          if (cws->workspace->IsProfiling(cws->tentry->device)) {
            evts.resize(evts.size() + 1);
            evt = &(evts.back());
          }
          LOG_CLML << "Enqueue CLML Copy";
          CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor,
                    layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor,
                    layer_.inputs[nid]->memory, 0, nullptr, evt);
          LOG_CLML << "Enqueue CLML Copy Completed";
        } else {
          DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
          cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
          int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;
          void* tmpptr = reinterpret_cast<void*>(malloc(isize * dtype_size));
          Tensor::CopyToBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
                              isize * dtype_size);
          CopyDataToCLMLTensor(layer_.inputs[nid], tmpptr);
          free(tmpptr);
        }
      }
    }
    LOG_CLML << "Inputs Set";

    int64_t duration = 0;
    if (cws->is_recordable_queue) {
      LOG_CLML << "Execution by Rec Queue";
      if (cws->workspace->IsProfiling(cws->tentry->device)) {
        Timer t;
        auto f = tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
        t = f->operator()(cws->tentry->device).cast<Timer>();
        t->Start();
        queue = CLML_QUEUE;
        evts.resize(evts.size() + 1);
        cl_event* evt = &(evts.back());
        CLML_CALL(clEnqueueRecordingMLOpQCOM, queue, this->layer_.recording, 0, nullptr, 0, nullptr,
                  0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, evt);
        t->Stop();
        duration += t->SyncAndGetElapsedNanos();
      } else {
        CLML_CALL(clEnqueueRecordingMLOpQCOM, queue, this->layer_.recording, 0, nullptr, 0, nullptr,
                  0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, nullptr);
      }
    } else {
      LOG_CLML << "Execution by Normal Queue";
      for (size_t i = 0; i < this->layer_.function.size(); ++i) {
        // Make CLML subgraphs accounted by OpenCLTimerNode.
        LOG_CLML << "Run Layer:" << this->layer_.layer_names[i];
        if (cws->workspace->IsProfiling(cws->tentry->device)) {
          Timer t;
          auto f = tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
          t = f->operator()(cws->tentry->device).cast<Timer>();
          t->Start();
          queue = CLML_QUEUE;
          evts.resize(evts.size() + 1);
          cl_event* evt = &(evts.back());
          CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
                    0, nullptr, evt);
          t->Stop();
          duration += t->SyncAndGetElapsedNanos();
          LOG_CLML << "Layer:" << this->layer_.layer_names[i]
                   << " Duration:" << t->SyncAndGetElapsedNanos();
        } else {
          CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
                    0, nullptr, nullptr);
        }
      }
    }
    if (cws->workspace->IsProfiling(cws->tentry->device)) {
      LOG_CLML << "Total Duration for " << clml_symbol << " is:" << duration;
    }

    LOG_CLML << "Run Completed";
    for (size_t i = 0; i < outputs_.size(); ++i) {
      uint32_t eid = EntryID(outputs_[i]);
      void* data = data_entry_[eid]->data;

      size_t osize = 1;
      for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
        osize *= data_entry_[eid]->shape[j];
      }
      if (kDLCPU == data_entry_[eid]->device.device_type) {
        CopyDataFromCLMLTensor(layer_.outputs[0], data);
      } else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
        layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
            ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
        cl_event cpy_evt = nullptr;
        cl_event* evt = &cpy_evt;
        if (cws->workspace->IsProfiling(cws->tentry->device)) {
          evts.resize(evts.size() + 1);
          evt = &(evts.back());
        }
        CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor,
                  layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor,
                  layer_.out_placeholder[i]->memory, 0, nullptr, evt);
      } else {
        DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
        cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
        int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;

        void* tmpptr = reinterpret_cast<void*>(malloc(osize * dtype_size));
        CopyDataFromCLMLTensor(layer_.outputs[0], tmpptr);
        Tensor::CopyFromBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
                              osize * dtype_size);
        free(tmpptr);
      }
    }
    LOG_CLML << "Run End";
  }

 private:
  /*!
   * \brief Tell whether a node id appears among the graph outputs.
   *
   * \param nid The node id to look up.
   * \return true when some entry of outputs_ has id_ equal to nid.
   */
  bool IsOutputTensor(int nid) {
    for (const auto& out_entry : outputs_) {
      if (nid == out_entry.id_) {
        return true;
      }
    }
    return false;
  }

  /*!
   * \brief Reset the on-chip memory pool bookkeeping of this layer.
   *
   * After the call the pool holds a single entry at key 0 sized to the
   * workspace's on-chip memory, the allocation records are empty and the
   * counters are back to their starting values.
   */
  void InitMemoryPool(void) {
    const auto onchip_capacity = cws->onchip_mem_size;

    // Counters and flags.
    layer_.in_chip_total_free = onchip_capacity;
    layer_.in_chip_total_alloc = 0;
    layer_.on_chip_alert_fail = 0;
    layer_.alloc_ping_pong = true;

    // Pool contents.
    layer_.on_chip_pool_alloc_info.clear();
    layer_.on_chip_pool_size.clear();
    layer_.on_chip_pool_size.insert({0, onchip_capacity});
  }

  /*!
   * \brief Plan Memory for activations to allocate on on-chip global memory where ever possible.
   *
   * Works in three stages:
   *  1. Build, for every non-const tensor consumed by a kernel node, a reference count
   *     (storage_ref_map) and the id of the last kernel node that consumes it (life_span).
   *  2. Walk the nodes in order. For each "kernel" or "input" node that has a storage map
   *     entry, record either an on-chip plan entry (on_chip_alloc_plan) or a DDR plan entry
   *     (ddr_alloc_plan), then call FreeMemory() for each of its non-const inputs.
   *  3. Log allocation statistics.
   *
   * On-chip memory is only requested when the workspace reports on-chip support and the
   * node is neither a graph output nor an input node.
   */
  void PlanMemory() {
    InitMemoryPool();
    // Build the ref count table for all activation tensors.
    LOG_MEM << "Build Ref Map";
    for (size_t nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      if (node.GetOpType() == "kernel") {
        std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
        for (auto& input_node : inputs) {
          if (nodes_[input_node.id_].GetOpType() != "const") {
            if (layer_.storage_ref_map.find(input_node.id_) == layer_.storage_ref_map.end()) {
              // First consumer seen for this tensor.
              layer_.storage_ref_map.insert({input_node.id_, 1});
              layer_.life_span.insert({input_node.id_, nid});
            } else {
              // Another consumer: bump the count and extend the life span to this node.
              layer_.storage_ref_map[input_node.id_]++;
              layer_.life_span[input_node.id_] = nid;
            }
          }
        }
      }
    }
    LOG_MEM << "Print Ref Map";

    for (auto it = layer_.storage_ref_map.begin(); it != layer_.storage_ref_map.end(); it++) {
      LOG_MEM << "RefMap:" << it->first << " Count:" << it->second
              << "Life Span:" << layer_.life_span[it->first];
    }

    for (size_t nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      uint32_t size = 0;
      if (this->layer_.storage_map.find(nid) == this->layer_.storage_map.end()) {
        // Possible that some nodes are not consumed by any operation
        // Example being nn.pad second argument.
        continue;
      }
      // Ask CLML how much memory this node's tensor needs.
      CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, layer_.storage_map[nid].tensor_desc->tensor,
                &size);

      if ((node.GetOpType() == "kernel") || (node.GetOpType() == "input")) {
        std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
        LOG_MEM << "Request :" << size << " Nid:" << nid;
        // -1 converts to the largest size_t value and serves as the
        // "no on-chip allocation" marker tested below.
        size_t offset = -1;
        // On-chip memory only for intermediate tensors with in recording scope.
        if ((cws->is_on_chip_memory) && (!IsOutputTensor(nid)) && (node.GetOpType() != "input")) {
          offset = RequestOnChipMemory(&this->layer_, size);
        }
        if (-1 != offset) {
          LOG_MEM << "Got On-Chip Mem:" << offset << "Nid:" << nid;
          layer_.on_chip_pool_alloc_info.insert({offset, nid});
          layer_.on_chip_alloc_plan.insert({nid, std::make_pair(size, offset)});
        } else {
          // No on-chip allocation: record the reject and plan the tensor in DDR instead.
          layer_.on_chip_reject.insert({nid, size});
          // DDR Allocation
          auto ddr_mem = RequestDDRMemory(&this->layer_, size);
          LOG_MEM << "Alloc DDR from global pool for nid:" << nid << " Type:" << node.GetOpType();
          layer_.ddr_alloc_plan.insert({nid, ddr_mem});
        }

        // Now free up the input tensors on-chip memory for reuse.
        for (auto& input_node : inputs) {
          if (nodes_[input_node.id_].GetOpType() != "const") {
            LOG_MEM << "Free Input Mem:" << input_node.id_;
            FreeMemory(&this->layer_, input_node.id_);
          }
        }
      }
    }

    // Stats dump
    size_t in_chip_total_alloc = 0;
    size_t total_reject = 0;
    for (auto it = layer_.on_chip_alloc_plan.begin(); it != layer_.on_chip_alloc_plan.end(); it++) {
      LOG_STATS << " On-chip Alloc:" << it->first << " Size:" << it->second.first
                << " Offset:" << it->second.second;
      in_chip_total_alloc += it->second.first;
    }

    for (auto it = layer_.on_chip_reject.begin(); it != layer_.on_chip_reject.end(); it++) {
      LOG_STATS << "Reject:" << it->first << " Size:" << it->second;
      total_reject += it->second;
    }
    LOG_STATS << "Total On-chip Alloc:" << in_chip_total_alloc + total_reject
              << " On-Chip:" << in_chip_total_alloc << " Reject:" << total_reject
              << " Alert Fail:" << layer_.on_chip_alert_fail;

    // Local variable named cws, used for the DDR global pool dump below; it
    // holds the pointer returned by CLMLWorkspace::Global().
    auto cws = CLMLWorkspace::Global();
    for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) {
      LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second;
    }
    for (auto it = this->layer_.ddr_storage_ref_map.begin();
         it != this->layer_.ddr_storage_ref_map.end(); it++) {
      LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second;
    }
  }

  /*!
   * \brief Create an CLML tensor from JSON node entry. Lookup storage map before creation.
   * Update input placeholder for NHWC layout
   *
   * \param nid The node index of graph JSON.
   * \param shape shape information of tensor
   * \param layout the tensor layout to be used
   * \param dtype tensor data type
   * \return CLML Tensor descriptor.
   */
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONEntry(
      size_t nid, std::vector<size_t> shape, cl_ml_tensor_layout_qcom layout, cl_uint dtype) {
    const JSONGraphNode node = nodes_[nid];
    cl_ml_tensor_usage_qcom usage = CL_TENSOR_USAGE_CNN_QCOM;

    if (this->layer_.storage_map.find(nid) != this->layer_.storage_map.end()) {
      if (nullptr != layer_.storage_map[nid].tensor_desc) {
        return this->layer_.storage_map[nid].tensor_desc;
      }
    } else {
      this->layer_.storage_map.insert({nid, NodeDescriptor()});
      this->layer_.storage_map[nid].node = node;
    }

    void* node_data = nullptr;
    if (node.GetOpType() == "const") {
      uint32_t eid = EntryID(nid, 0);
      node_data = data_entry_[eid]->data;
      usage = CL_TENSOR_USAGE_PARAMETER_QCOM;
      ICHECK(CL_TENSOR_USAGE_INVALID_QCOM == this->layer_.storage_map[nid].usage)
          << "Parameter have usage reservation !!!";
    }
    if (CL_TENSOR_USAGE_INVALID_QCOM != this->layer_.storage_map[nid].usage) {
      // Respect special reservation on usage.
      usage = this->layer_.storage_map[nid].usage;
    } else {
      this->layer_.storage_map[nid].usage = usage;
    }
    if (this->layer_.storage_map[nid].custom_layout) {
      // Respect special reservation on layout.
      layout = this->layer_.storage_map[nid].layout;
    } else {
      this->layer_.storage_map[nid].layout = layout;
    }

    auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, node_data, shape);

    this->layer_.storage_map[nid].tensor_desc = clml_tensor;
    this->layer_.storage_map[nid].usage = usage;
    this->layer_.storage_map[nid].layout = layout;
    LOG_CLML << "Storage Map Alloc:" << nid << " Name:" << node.GetOpName() << " Usage: " << usage
             << " Layout:" << layout;

    if ("input" == node.GetOpType()) {
      this->layer_.inputs.insert({nid, this->layer_.storage_map[nid].tensor_desc});
      // Input copy placeholder Tensor
      if (layout == CL_TENSOR_LAYOUT_OPTIMAL_QCOM) {
        this->layer_.in_placeholder.insert(
            {nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, usage, dtype,
                                             node_data, shape)});
      } else {
        this->layer_.in_placeholder.insert(
            {nid, MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, node_data, shape)});
      }
    }
    return clml_tensor;
  }

  /*!
   * \brief Build CLML layer from JSON representation and cache.
   *
   * \note For the time being only one layer or operator is supported
   * per engine.
   */
  void BuildEngine() {
    // Phases, in order: (1) layout-specific tensor pre-pass, (2) op creation per kernel node,
    // (3) output placeholders, (4) memory planning and allocation, (5) descriptor set setup,
    // (6) optional tuning run and (7) optional command recording.
    size_t nid;
    // Create tensors for the operators which have a distinct layout format
    // other than CL_TENSOR_LAYOUT_OPTIMAL_QCOM (dense, batch_matmul and softmax nodes).
    for (nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_, node, nid);
      if ("nn.batch_matmul" == node.GetOpName()) CreateBatchMatmulLayerTensor(&layer_, node, nid);
      if ("nn.softmax" == node.GetOpName() || PatternMatch(node.GetOpName(), "nn.softmax"))
        CreateSoftmaxLayerTensor(&layer_, node, nid);
    }

    // Walk all nodes again and create one CLML op per "kernel" node, dispatching on the op name.
    for (nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      if (node.GetOpType() == "input") {
        // Layers may request a different layout. Defer the input allocation.
      } else if (node.GetOpType() == "kernel") {
        auto op_name = node.GetOpName();
        // NOTE(review): the branch order presumably matters wherever one pattern could also match
        // a longer op name (e.g. "nn.relu6" is tested before "nn.relu") - confirm against the
        // PatternMatch implementation before reordering.
        if (PatternMatch(op_name, "nn.conv2d") || PatternMatch(op_name, "nn.pad_conv2d"))
          CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, nid);
        else if (PatternMatch(op_name, "nn.depthwise_conv2d"))
          CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_DEPTHWISE_QCOM, nid);
        else if (PatternMatch(op_name, "nn.conv2d_transpose"))
          CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_TRANSPOSE_QCOM, nid);
        else if ("nn.relu6" == op_name || PatternMatch(op_name, "nn.relu6"))
          CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU6);
        else if (PatternMatch(op_name, "nn.relu"))
          CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU);
        else if (PatternMatch(op_name, "nn.batch_norm"))
          CreateBatchNormLayer(&layer_, node, nid);
        else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
                 "nn.l2_pool2d" == op_name || PatternMatch(op_name, "nn.max_pool2d") ||
                 PatternMatch(op_name, "nn.avg_pool2d"))
          CreatePoolingLayer(&layer_, node, nid);
        else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name ||
                 PatternMatch(op_name, "nn.global_avg_pool2d") ||
                 PatternMatch(op_name, "nn.global_max_pool2d"))
          CreateGlobalPoolingLayer(&layer_, node, nid);
        else if ("reshape" == op_name || PatternMatch(op_name, "reshape"))
          CreateReshapeLayer(&layer_, node, nid);
        else if ("concatenate" == op_name)
          CreateConcatLayer(&layer_, node, nid);
        else if ("nn.dense" == op_name)
          CreateDenseLayer(&layer_, node, nid);
        else if ("nn.softmax" == op_name || PatternMatch(op_name, "nn.softmax"))
          CreateSoftMaxLayer(&layer_, node, nid);
        else if ("nn.pad" == op_name)
          CreatePadLayer(&layer_, node, nid);
        else if ("nn.batch_flatten" == op_name)
          CreateBatchFlattenLayer(&layer_, node, nid);
        else if ("clip" == op_name)
          CreateClipLayer(&layer_, node, nid);
        else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
                 "minimum" == op_name || "maximum" == op_name || "divide" == op_name ||
                 PatternMatch(op_name, "relax.add") || PatternMatch(op_name, "relax.subtract") ||
                 PatternMatch(op_name, "relax.multiply") ||
                 PatternMatch(op_name, "relax.minimum") || PatternMatch(op_name, "relax.maximum") ||
                 PatternMatch(op_name, "relax.divide"))
          CreateBinaryLayer(&layer_, node, nid);
        else if ("nn.depth_to_space" == op_name)
          CreateDepthToSpaceLayer(&layer_, node, nid);
        else if ("nn.upsampling" == op_name)
          CreateResizeLayer(&layer_, node, nid);
        else if ("nn.batch_matmul" == op_name)
          CreateBatchMatmulLayer(&layer_, node, nid);
        else
          LOG(FATAL) << "Unsupported op: " << op_name;
        // layer_names is indexed in parallel with layer_.function by the tuning loop below.
        this->layer_.layer_names.push_back(op_name);
        // Keep map of function and Node to use in profiling.
        // Assumes the branch taken above appended exactly one op, so function.back() is the op
        // that belongs to this node.
        this->layer_.op_node_map.insert({this->layer_.function.back(), std::make_pair(nid, node)});
      } else if (node.GetOpType() != "const") {
        LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
      }
    }

    // Register the graph outputs: remember the tensor descriptor stored for the output node and
    // create an NCHW placeholder tensor for each output.
    for (size_t i = 0; i < outputs_.size(); ++i) {
      nid = outputs_[i].id_;
      DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
      cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
      this->layer_.outputs.push_back(this->layer_.storage_map[nid].tensor_desc);
      if (this->layer_.out_shapes.find(nid) != this->layer_.out_shapes.end()) {
        // Handle customized shapes here: a shape recorded in out_shapes overrides the node shape.
        this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
            nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype, nullptr,
            this->layer_.out_shapes[nid]));
      } else {
        this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
            nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype));
      }
    }

    // Plan memory utilization
    PlanMemory();

    // Allocate device memories and initialize the params if any.
    // The three counters below are only used for the statistics logged after the loop.
    cl_int result = 0;
    size_t alloc_on_chip = 0;
    size_t alloc_ddr = 0;
    size_t alloc_ddr_reuse = 0;
    for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
      auto tensor_desc = it->second.tensor_desc;
      uint32_t mem_size = 0;
      // NOTE(review): 'result' is not read again inside this loop as written; whether CLML_CALL
      // touches it is not visible here - confirm before removing this assignment.
      result = CL_OUT_OF_HOST_MEMORY;
      CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, tensor_desc->tensor, &mem_size);

      // Local copy of the graph node that belongs to this storage entry.
      JSONGraphNode node = it->second.node;
      void* node_data = nullptr;
      // -1 wraps to the largest size_t; it is overwritten only on the on-chip path below.
      size_t on_chip_mem_offset = -1;
      // Pick the backing memory: on-chip plan first, then the shared DDR plan, then a dedicated
      // DDR buffer for constants. Anything else has no allocation and is fatal.
      if (layer_.on_chip_alloc_plan.find(it->first) != layer_.on_chip_alloc_plan.end()) {
        LOG_MEM << "Found GMEM Alloc:" << it->first
                << " Size:" << layer_.on_chip_alloc_plan[it->first].first
                << " Offset:" << layer_.on_chip_alloc_plan[it->first].second;
        on_chip_mem_offset = layer_.on_chip_alloc_plan[it->first].second;
        alloc_on_chip += mem_size;
        tensor_desc->memory = AllocateOnChipTensorMemory(mem_size, on_chip_mem_offset);
      } else if (layer_.ddr_alloc_plan.find(it->first) != layer_.ddr_alloc_plan.end()) {
        LOG_MEM << "DDR Alloc for nid:" << it->first << " Type:" << node.GetOpType();
        tensor_desc->memory = layer_.ddr_alloc_plan[it->first];
        alloc_ddr_reuse += mem_size;
        //} else if ((node.GetOpType() == "input") || IsOutputTensor(it->first) || (node.GetOpType()
        //== "const")) {
      } else if (node.GetOpType() == "const") {
        LOG_MEM << "DDR Alloc for Const/Input/Output";
        tensor_desc->memory = AllocateDDRTensorMemory(mem_size);
        alloc_ddr += mem_size;
      } else {
        LOG(FATAL) << "Mem allocation not found on DDR as well as On-Chip nid: " << it->first
                   << " Type:" << node.GetOpType();
      }

      // Constants are uploaded right away when their host data is available.
      if (node.GetOpType() == "const") {
        node_data = data_entry_[EntryID(it->first, 0)]->data;
        if (node_data != nullptr) {
          CopyDataToCLMLTensor(tensor_desc, node_data);
        }
      }
      // Stores a copy of the descriptor; it is handed to the descriptor set below.
      this->layer_.tensorMemDescs.push_back(*tensor_desc);
    }
    LOG_STATS << "Total On-Chip Allocation  :" << alloc_on_chip;
    LOG_STATS << "Total DDR Reuse Allocation:" << alloc_ddr_reuse;
    LOG_STATS << "Total DDR fixed allocation:" << alloc_ddr;
    // Log the sizes and reference counts of the workspace-global and the layer-local DDR pools.
    size_t ddr_global_pool = 0;
    size_t ddr_local_pool = 0;
    auto cws = CLMLWorkspace::Global();
    for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) {
      LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second;
      ddr_global_pool += it->second.first;
    }
    LOG_STATS << "Total Global Pool:" << ddr_global_pool;
    for (auto it = this->layer_.ddr_storage_ref_map.begin();
         it != this->layer_.ddr_storage_ref_map.end(); it++) {
      LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second;
      ddr_local_pool += it->second.first;
    }
    LOG_STATS << "Total Local Pool:" << ddr_local_pool;

    // Setup descriptor set: create it, then register every tensor memory descriptor collected
    // in the allocation loop.
    CLML_CALL(clCreateMLTensorMemoryDescriptorSetQCOM, &this->layer_.descriptorSet);

    CLML_CALL(clUpdateMLTensorMemoryDescriptorSetQCOM, this->layer_.descriptorSet,
              static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
              this->layer_.tensorMemDescs.data());

    // Tuning run: tune every op, then append the serialized tuning cache to the tuning file.
    if (cws->is_tuning_run) {
      LOG_CLML << "CLML Tunning In Progress:";
      // Let the command queue be recreated in profiling mode for the duration of the tuning.
      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true);
      for (size_t i = 0; i < this->layer_.function.size(); ++i) {
        LOG_CLML << "CLML Tunning:" << this->layer_.layer_names[i];
        CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet,
                  this->layer_.tuning_cache, nullptr);
      }
      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, false);

      // First call passes no buffer and only retrieves the required length; the second call
      // fills the buffer sized from that length.
      size_t cache_len_bytes = 0;
      size_t len_ret = 0;
      CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, 0, nullptr, &cache_len_bytes);

      std::vector<unsigned char> saved_cache(cache_len_bytes, 0);
      CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, saved_cache.size(),
                saved_cache.data(), &len_ret);

      // Record written per engine: magic header, reserved word, clml_symbol, cache blob.
      std::string tune_str;
      dmlc::MemoryStringStream mstrm(&tune_str);
      dmlc::Stream* strm = &mstrm;
      uint64_t header = kTVMCLMLTuningCacheMagic;
      uint64_t reserved = 0x0;
      strm->Write(header);
      strm->Write(reserved);
      strm->Write(clml_symbol);
      strm->Write(saved_cache);

      // Opened in append mode, so the record is added after whatever the file already holds.
      std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary);
      ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file;
      fs.write(&tune_str[0], tune_str.length());
      LOG_CLML << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size"
               << tune_str.length() << " with tuning blob len " << saved_cache.size();
    }
    // Recordable queue mode: enqueue every op once into the recordable queue, then close the
    // recording.
    if (cws->is_recordable_queue) {
      for (size_t i = 0; i < this->layer_.function.size(); ++i) {
        CLML_CALL(clEnqueueMLOpQCOM, this->layer_.recordable_queue, this->layer_.function[i],
                  this->layer_.descriptorSet, 0, nullptr, nullptr);
      }

      result = clEndRecordingQCOM(this->layer_.recording);
      ICHECK(result == CL_SUCCESS) << "clEndRecordingQCOM:" << result;
    }
  }

  /*!
   * \brief Create a 2D convolution layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param mode The conv2d mode type - CL_CONVOLUTION_MODE_CONVOLUTION_QCOM
   *                                    or CL_CONVOLUTION_MODE_DEPTHWISE_QCOM
   *                                    or CL_CONVOLUTION_MODE_TRANSPOSE_QCOM.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node,
                                cl_convolution_mode_qcom mode, size_t nid) {
    // "padding" is optional. HasAttr() is queried before GetAttr() so that a node without the
    // attribute really falls back to zero padding, independent of how GetAttr() treats a
    // missing key.
    std::vector<cl_uint> clml_padding(4, 0);
    if (node.HasAttr("padding")) {
      std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
      clml_padding = GetVectorValues(padding);
    }
    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
    std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
    std::vector<cl_uint> v_strides = GetVectorValues(strides);
    std::vector<cl_uint> v_dilation = GetVectorValues(dilation);
    // The descriptor below reads padding[0..3], strides[0..1] and dilation[0..1]; reject shorter
    // attribute lists instead of indexing past the end of the vectors.
    ICHECK(clml_padding.size() >= 4)
        << "Convolution padding requires 4 values, got " << clml_padding.size();
    ICHECK(v_strides.size() >= 2) << "Convolution strides requires 2 values, got "
                                  << v_strides.size();
    ICHECK(v_dilation.size() >= 2)
        << "Convolution dilation requires 2 values, got " << v_dilation.size();

    // Tensor element type and arithmetic mode are both derived from the node's first dtype.
    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);

    cl_uint groups = std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
    if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
      ICHECK(groups == 1) << "CLML convolution only supports group size of 1.";
    } else {
      groups = 1;  // Don't need to pass groups to depthwise
    }

    // Optional fused activation ("relu" or "relu6").
    bool has_act = false;
    cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU;
    if (node.HasAttr("activation_type")) {
      std::string activation_type =
          node.GetAttr<std::vector<std::string>>("activation_type")[0];
      ICHECK(activation_type == "relu" || activation_type == "relu6")
          << "Unknown activation type:" << activation_type;
      clml_act_type = (activation_type == "relu") ? CL_ACTIVATION_RELU : CL_ACTIVATION_RELU6;
      has_act = true;
    }
    cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
                                              cl_arithmetic_mode};

    // Collect inputs and outputs, handling nn.conv2d.
    // The input count encodes the fusion: 3 or 7 inputs carry a bias, 6 or 7 carry batchnorm.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    size_t num_inputs = inputs.size();
    ICHECK(num_inputs >= 2 && num_inputs <= 7)
        << "Batchnorm fused convolution requires max 7 arguments";
    const bool has_bias = (num_inputs == 3) || (num_inputs == 7);
    const bool has_bn = (num_inputs == 6) || (num_inputs == 7);
    // Input
    auto input =
        MakeCLMLTensorFromJSONEntry(inputs[0].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    // Weight
    auto weight =
        MakeCLMLTensorFromJSONEntry(inputs[1].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    // Bias: either the real bias tensor, or a fresh descriptor wrapping a tensor that is
    // flagged as unused (kept in layer_.unusedTensor).
    std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias;
    if (has_bias) {
      bias =
          MakeCLMLTensorFromJSONEntry(inputs[2].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    } else {
      cl_ml_tensor_desc_qcom desc = {};
      desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
      CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
                                     &layer_.unusedTensor);
      ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
      bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
      bias->tensor = layer_.unusedTensor;
    }
    // Output
    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    // Padding is split into a "before" pair (values 0,1) and an "after" pair (values 2,3).
    cl_ml_op_convolution_desc_qcom conv_desc{mode,
                                             groups,
                                             4,
                                             {clml_padding[0], clml_padding[1]},
                                             {clml_padding[2], clml_padding[3]},
                                             {v_strides[0], v_strides[1]},
                                             {v_dilation[0], v_dilation[1]},
                                             0,
                                             cl_arithmetic_mode};

    cl_ml_op_qcom op = nullptr;
    if (!has_bn) {
      if (!has_act) {
        // NOTE(review): this is the only creator here that passes nullptr instead of
        // layer_.tuning_cache as the last argument - confirm that this is intentional.
        CLML_CALL(clCreateMLOpConvolutionForwardQCOM, CLML_CTX, nullptr, &conv_desc, input->tensor,
                  weight->tensor, bias->tensor, output->tensor, &op, nullptr);
      } else {
        CLML_CALL(clCreateMLOpFusedConvolutionActivationForwardQCOM, CLML_CTX, nullptr, &conv_desc,
                  &act_desc, input->tensor, weight->tensor, bias->tensor, nullptr, output->tensor,
                  &op, layer_.tuning_cache);
      }
    } else {
      // The batchnorm inputs (scale, bias, mean, variance) follow the optional conv bias.
      int bn_index = has_bias ? 3 : 2;
      // The "batchnorm" attribute carries the axis at [0] and epsilon at [1].
      const std::vector<std::string> bn_attrs =
          node.GetAttr<std::vector<std::string>>("batchnorm");
      ICHECK(bn_attrs.size() >= 2)
          << "Fused batchnorm requires axis and epsilon, got " << bn_attrs.size() << " values";
      int axis = std::stoi(bn_attrs[0]);
      auto bn_dims = GetTensorDims(nodes_[inputs[bn_index].id_]);
      float epsilon = std::stof(bn_attrs[1]);

      // Property list: the epsilon key, the bit pattern of epsilon as its value, then the
      // list terminator.
      std::vector<cl_ml_op_properties_qcom> opProperties;
      opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
      opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
      opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);
      // All batchnorm parameters share a 4-D shape that is 1 everywhere except along `axis`.
      std::vector<size_t> bn_shape = {1, 1, 1, 1};
      ICHECK(axis >= 0 && static_cast<size_t>(axis) < bn_shape.size())
          << "Fused batchnorm axis out of range: " << axis;
      bn_shape[axis] = bn_dims.n;
      auto bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index].id_, bn_shape,
                                                  CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
      auto bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1].id_, bn_shape,
                                                 CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
      auto bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2].id_, bn_shape,
                                                 CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
      auto bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3].id_, bn_shape,
                                                CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

      cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
      if (!has_act) {
        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormForwardQCOM, CLML_CTX, opProperties.data(),
                  &conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor, output->tensor,
                  bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op,
                  layer_.tuning_cache);
      } else {
        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM, CLML_CTX,
                  opProperties.data(), &conv_desc, &bn_desc, &act_desc, input->tensor,
                  weight->tensor, bias->tensor, output->tensor, nullptr, bn_mean->tensor,
                  bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, layer_.tuning_cache);
      }
    }
    // Exactly one op is appended per convolution node, whichever variant was created.
    layer->function.push_back(op);
  }

  /*!
   * \brief Create a ReLU(X) layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateReLULayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid,
                       cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
    // Element type and arithmetic mode both follow the node's first dtype.
    DLDataType node_dtype = node.GetOpDataType()[0];
    cl_channel_type elem_type = MakeCLDataType(node_dtype);
    cl_arithmetic_mode_qcom arith_mode = MakeCLArithMode(elem_type, elem_type);

    // Source and destination tensors of the activation.
    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, elem_type);
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, elem_type);

    // The activation op takes one more tensor argument; a tensor flagged as unused is created
    // for it and kept in layer_.unusedTensor.
    cl_ml_tensor_desc_qcom unused_desc = {};
    unused_desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
    CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &unused_desc, CL_TENSOR_USAGE_UNUSED_QCOM,
                                   &layer_.unusedTensor);
    ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";

    cl_ml_op_activation_desc_qcom activation = {clml_act_type, CL_PROPAGATE_NAN_QCOM, arith_mode};
    cl_ml_op_qcom act_op = nullptr;
    CLML_CALL(clCreateMLOpActivationForwardQCOM, CLML_CTX, nullptr, &activation, src->tensor,
              layer_.unusedTensor, dst->tensor, &act_op, layer_.tuning_cache);
    ICHECK(act_op) << "Activation Error";

    layer->function.push_back(act_op);
  }

  /*!
   * \brief Create a batch norm layer.
   *
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateBatchNormLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    cl_ml_op_qcom op = nullptr;
    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);

    // Fetch the input list once. Inputs are read as: [0] data, [1] scale, [2] bias, [3] mean,
    // [4] variance, so fewer than five entries cannot be handled.
    const std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    ICHECK(inputs.size() >= 5) << "Batchnorm requires 5 inputs, got " << inputs.size();

    auto input =
        MakeCLMLTensorFromJSONEntry(inputs[0].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
    float epsilon = std::stof(node.GetAttr<std::vector<std::string>>("epsilon")[0]);

    // Property list: the epsilon key, the bit pattern of epsilon as its value, then the
    // list terminator.
    std::vector<cl_ml_op_properties_qcom> opProperties;
    opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
    opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
    opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);

    // All batchnorm parameters share a 4-D shape that is 1 everywhere except along `axis`.
    // A negative or too large axis would write outside bn_shape, so it is rejected up front.
    auto bn_dims = GetTensorDims(nodes_[inputs[1].id_]);
    std::vector<size_t> bn_shape = {1, 1, 1, 1};
    ICHECK(axis >= 0 && static_cast<size_t>(axis) < bn_shape.size())
        << "Batchnorm axis out of range: " << axis;
    bn_shape[axis] = bn_dims.n;
    auto bn_scale = MakeCLMLTensorFromJSONEntry(inputs[1].id_, bn_shape,
                                                CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto bn_bias = MakeCLMLTensorFromJSONEntry(inputs[2].id_, bn_shape,
                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto bn_mean = MakeCLMLTensorFromJSONEntry(inputs[3].id_, bn_shape,
                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto bn_var = MakeCLMLTensorFromJSONEntry(inputs[4].id_, bn_shape,
                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

    cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};

    // Note the argument order expected by the call: mean, variance, scale, bias.
    CLML_CALL(clCreateMLOpBatchNormForwardQCOM, CLML_CTX, opProperties.data(), &bn_desc,
              input->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor,
              output->tensor, &op, layer_.tuning_cache);
    ICHECK(op) << "Batchnorm Error";

    layer->function.push_back(op);
  }

  /*!
   * \brief Create a pooling layer.
   *
   * \note Used for nn.max_pool2d, nn.avg_pool2d and nn.l2_pool2d; global pooling is handled by
   * CreateGlobalPoolingLayer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    cl_ml_op_qcom op = nullptr;
    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

    std::vector<std::string> windows = node.GetAttr<std::vector<std::string>>("pool_size");
    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
    std::vector<cl_uint> clml_window = GetVectorValues(windows);
    std::vector<cl_uint> clml_stride = GetVectorValues(strides);
    std::vector<cl_uint> clml_padding = GetVectorValues(padding);
    // The descriptor reads window[0..1], stride[0..1] and padding[0..3]; reject shorter
    // attribute lists instead of indexing past the end of the vectors.
    ICHECK(clml_window.size() >= 2)
        << "Pooling pool_size requires 2 values, got " << clml_window.size();
    ICHECK(clml_stride.size() >= 2)
        << "Pooling strides requires 2 values, got " << clml_stride.size();
    ICHECK(clml_padding.size() >= 4)
        << "Pooling padding requires 4 values, got " << clml_padding.size();

    // Max pooling is selected for nn.max_pool2d; every other op routed here uses the
    // average (padding excluded) mode.
    const auto op_name = node.GetOpName();
    const bool is_max_pool =
        (op_name == "nn.max_pool2d") || PatternMatch(op_name, "nn.max_pool2d");

    cl_ml_op_pooling_desc_qcom pool_desc = {
        is_max_pool ? CL_POOLING_MODE_MAX_QCOM : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
        4,  // reserved
        {clml_padding[0], clml_padding[1]},
        {clml_padding[2], clml_padding[3]},
        {clml_stride[0], clml_stride[1]},
        {clml_window[0], clml_window[1]},
        CL_PROPAGATE_NAN_QCOM,
        cl_arithmetic_mode,
    };

    // The pooling op takes one more tensor argument; a tensor flagged as unused is created for it.
    // NOTE(review): the handle is kept only in this local, whereas CreateGlobalPoolingLayer stores
    // it in layer_.unusedTensor - confirm who is responsible for releasing it.
    cl_ml_tensor_desc_qcom desc = {};
    cl_ml_tensor_qcom unusedTensor = nullptr;
    desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
    CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
                                   &unusedTensor);
    ICHECK(unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";

    CLML_CALL(clCreateMLOpPoolingForwardQCOM, CLML_CTX, nullptr, &pool_desc, input->tensor,
              unusedTensor, output->tensor, &op, layer_.tuning_cache);
    ICHECK(op) << "Pooling Error";

    layer->function.push_back(op);
  }

  /*!
   * \brief Create a global pooling layer.
   *
   * \note Currently global_max_pool2d and global_avg_pool2d are supported.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateGlobalPoolingLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    // Element type and arithmetic mode both follow the node's first dtype.
    DLDataType node_dtype = node.GetOpDataType()[0];
    cl_channel_type elem_type = MakeCLDataType(node_dtype);
    cl_arithmetic_mode_qcom arith_mode = MakeCLArithMode(elem_type, elem_type);

    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, elem_type);
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, elem_type);

    // The pooling window is taken from the input's w and h dimensions, with zero padding and
    // unit stride.
    auto src_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
    const bool use_max = (node.GetOpName() == "nn.global_max_pool2d") ||
                         PatternMatch(node.GetOpName(), "nn.global_max_pool2d");
    cl_ml_op_pooling_desc_qcom global_pool_desc = {
        use_max ? CL_POOLING_MODE_MAX_QCOM : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
        4,  // reserved
        {0, 0},
        {0, 0},
        {1, 1},
        {src_dims.w, src_dims.h},
        CL_PROPAGATE_NAN_QCOM,
        arith_mode,
    };

    // The pooling op takes one more tensor argument; a tensor flagged as unused is created
    // for it and kept in layer_.unusedTensor.
    cl_ml_tensor_desc_qcom unused_desc = {};
    unused_desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
    CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &unused_desc, CL_TENSOR_USAGE_UNUSED_QCOM,
                                   &layer_.unusedTensor);
    ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";

    cl_ml_op_qcom pool_op = nullptr;
    CLML_CALL(clCreateMLOpPoolingForwardQCOM, CLML_CTX, nullptr, &global_pool_desc, src->tensor,
              layer_.unusedTensor, dst->tensor, &pool_op, layer_.tuning_cache);
    ICHECK(pool_op) << "Pooling Error";

    layer->function.push_back(pool_op);
  }

  /*!
   * \brief Create the Softmax layer tensors with a supported layout.
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateSoftmaxLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    DLDataType node_dtype = node.GetOpDataType()[0];
    cl_channel_type elem_type = MakeCLDataType(node_dtype);
    auto src_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
    int softmax_axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);

    // Default (2D) case keeps the optimal layout. With both spatial dims present (4D) the
    // layout follows the axis: NHWC for the last axis (3 or -1), NCHW otherwise.
    cl_ml_tensor_layout_qcom tensor_layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
    if (src_dims.h >= 1 && src_dims.w >= 1) {
      const bool last_axis = (softmax_axis == 3) || (softmax_axis == -1);
      tensor_layout = last_axis ? CL_TENSOR_LAYOUT_NHWC_QCOM : CL_TENSOR_LAYOUT_NCHW_QCOM;
    }

    // Only the tensors are created here (output first, then input); no op is added.
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, tensor_layout, elem_type);
    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, tensor_layout, elem_type);
  }

  /*!
   * \brief Create a SoftMax layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateSoftMaxLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    // Element type and arithmetic mode both follow the node's first dtype.
    DLDataType node_dtype = node.GetOpDataType()[0];
    cl_channel_type elem_type = MakeCLDataType(node_dtype);
    cl_arithmetic_mode_qcom arith_mode = MakeCLArithMode(elem_type, elem_type);

    // Tensors are requested with the optimal layout, output first and then input.
    cl_ml_tensor_layout_qcom tensor_layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, tensor_layout, elem_type);
    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, tensor_layout, elem_type);

    // Accurate algorithm in spatial mode.
    cl_softmax_mode_qcom softmax_mode = CL_SOFTMAX_MODE_SPATIAL_QCOM;
    cl_ml_op_softmax_desc_qcom desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, softmax_mode,
                                       arith_mode};

    cl_ml_op_qcom softmax_op = nullptr;
    CLML_CALL(clCreateMLOpSoftmaxQCOM, CLML_CTX, nullptr, &desc, src->tensor, dst->tensor,
              &softmax_op, layer_.tuning_cache);
    ICHECK(softmax_op) << "SoftMax Error";

    layer->function.push_back(softmax_op);
  }

  /*!
   * \brief Create a Pad layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreatePadLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    cl_ml_op_qcom op = nullptr;
    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

    std::string pad_mode = node.GetAttr<std::vector<std::string>>("pad_mode")[0];
    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("pad_width");
    std::vector<cl_uint> clml_padding = GetVectorValues(padding);
    // The descriptor reads the first four pad_width values; reject shorter lists instead of
    // indexing past the end of the vector.
    ICHECK(clml_padding.size() >= 4)
        << "Pad pad_width requires at least 4 values, got " << clml_padding.size();

    // Map the pad mode string onto the CLML pad mode; anything else is fatal.
    // NOTE(review): "edge" is mapped to CL_PAD_MODE_SYMMETRIC_QCOM - confirm that the two modes
    // really have the same semantics.
    cl_pad_mode_qcom clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
    if (pad_mode == "constant")
      clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
    else if (pad_mode == "edge")
      clml_pad_mode = CL_PAD_MODE_SYMMETRIC_QCOM;
    else if (pad_mode == "reflect")
      clml_pad_mode = CL_PAD_MODE_REFLECT_QCOM;
    else
      LOG(FATAL) << "Padding mode not supported by CLML:" << pad_mode;

    // Only the first four padding slots are populated; the remaining four stay zero.
    cl_ml_op_pad_desc_qcom pad_desc{
        clml_pad_mode,
        {0, 0},
        {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
        cl_arithmetic_mode};

    CLML_CALL(clCreateMLOpPadQCOM, CLML_CTX, nullptr, &pad_desc, input->tensor, output->tensor, &op,
              layer_.tuning_cache);
    ICHECK(op) << "Pad Error";

    layer->function.push_back(op);
  }

  /*!
   * \brief Create a Batch Flatten layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateBatchFlattenLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    DLDataType node_dtype = node.GetOpDataType()[0];
    cl_channel_type elem_type = MakeCLDataType(node_dtype);

    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, elem_type);
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, elem_type);

    // Batch flatten is expressed as a CLML reshape from the input tensor to the output tensor.
    cl_ml_op_qcom flatten_op = nullptr;
    CLML_CALL(clCreateMLOpReshapeQCOM, CLML_CTX, nullptr, src->tensor, dst->tensor, &flatten_op,
              layer_.tuning_cache);
    ICHECK(flatten_op) << "Reshape Error";

    layer->function.push_back(flatten_op);
  }

  /*!
   * \brief Create a Reshape layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    DLDataType node_dtype = node.GetOpDataType()[0];
    cl_channel_type elem_type = MakeCLDataType(node_dtype);

    // Both tensors use the optimal layout; the target shape comes from the output node itself.
    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, elem_type);
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, elem_type);

    cl_ml_op_qcom reshape_op = nullptr;
    CLML_CALL(clCreateMLOpReshapeQCOM, CLML_CTX, nullptr, src->tensor, dst->tensor, &reshape_op,
              layer_.tuning_cache);
    ICHECK(reshape_op) << "Reshape Error";

    layer->function.push_back(reshape_op);
  }

  /*!
   * \brief Create a concat layer.
   *
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateConcatLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    cl_ml_op_qcom op = nullptr;
    // Fetch the input list once and reuse it for both the count and the ids.
    const std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    // The concat axis is stored as the first string of the "axis" attribute.
    cl_uint axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);

    // Collect the input tensor handles in a vector so the buffer is released
    // automatically, including when CLML_CALL / ICHECK below raise an error.
    std::vector<cl_ml_tensor_qcom> concat_inputs;
    concat_inputs.reserve(inputs.size());
    for (const JSONGraphNodeEntry& entry : inputs) {
      auto input =
          MakeCLMLTensorFromJSONEntry(entry.id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
      concat_inputs.push_back(input->tensor);
    }
    cl_ml_op_concat_desc_qcom concatDesc = {axis, static_cast<cl_uint>(concat_inputs.size()),
                                            cl_arithmetic_mode};

    CLML_CALL(clCreateMLOpConcatQCOM, CLML_CTX, nullptr, &concatDesc, concat_inputs.data(),
              output->tensor, &op, layer_.tuning_cache);
    ICHECK(op) << "Concat Error";

    layer->function.push_back(op);
  }

  /*!
   * \brief Create a dense layer.
   *
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateDenseLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    cl_ml_op_qcom op = nullptr;
    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
    // Fetch the input list once and reuse it below.
    const std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    // A third input, when present, is the bias.
    const bool has_bias = (inputs.size() == 3);
    auto in_dims = GetTensorDims(nodes_[inputs[0].id_]);

    // A single-row input with bias takes the fully-connected path with the
    // optimal layout; everything else goes through GEMM with NCHW tensors.
    cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
    bool is_vec_matmul = false;
    if (in_dims.n == 1 && has_bias) {
      layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
      is_vec_matmul = true;
    }

    auto input = MakeCLMLTensorFromJSONEntry(inputs[0].id_, {}, layout, cl_dtype);
    auto wt_dims = GetTensorDims(nodes_[inputs[1].id_]);
    auto weight =
        MakeCLMLTensorFromJSONEntry(inputs[1].id_, {1, 1, wt_dims.n, wt_dims.c}, layout, cl_dtype);
    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);

    // Without a bias input, a placeholder descriptor pointing at the layer's
    // "unused" tensor is used so bias->tensor is always valid below.
    std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias;
    if (has_bias) {
      bias = MakeCLMLTensorFromJSONEntry(inputs[2].id_, {}, layout, cl_dtype);
    } else {
      bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
      bias->tensor = layer_.unusedTensor;
    }

    if (is_vec_matmul) {
      // Transpose the weights when their `c` extent matches the input's.
      cl_fc_weight_transform_qcom w_transform = CL_FC_WEIGHT_TRANSFORM_NONE_QCOM;
      if (in_dims.c == wt_dims.c) w_transform = CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM;

      cl_ml_op_fully_connected_desc_qcom fc_desc{1,  // refer clml_ops.txt for struct
                                                 w_transform, cl_arithmetic_mode};

      CLML_CALL(clCreateMLOpFullyConnectedQCOM, CLML_CTX, nullptr, &fc_desc, input->tensor,
                weight->tensor, bias->tensor, output->tensor, &op, layer_.tuning_cache);
      ICHECK(op) << "FC layer Error";
      layer->function.push_back(op);
    } else {
      cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
      if (in_dims.c == wt_dims.c) b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;

      cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.n,                    // m
                                          wt_dims.n,                    // n
                                          wt_dims.c,                    // k
                                          CL_GEMM_TRANSFORM_NONE_QCOM,  // A transform
                                          b_transform,                  // B transform
                                          {{1.0}, CL_FLOAT},            // alpha
                                          {{0.0}, CL_FLOAT},            // beta
                                          cl_arithmetic_mode};

      CLML_CALL(clCreateMLOpGemmQCOM, CLML_CTX, nullptr, &gemmDesc, input->tensor, weight->tensor,
                output->tensor, &op, layer_.tuning_cache);
      ICHECK(op) << "Gemm layer Error";
      layer->function.push_back(op);
      if (has_bias) {
        cl_ml_op_binary_desc_qcom binaryDesc = {CL_TENSOR_OP_ADD_QCOM,
                                                {{1.0}, CL_FLOAT},  // alpha
                                                {{1.0}, CL_FLOAT},  // beta
                                                {{1.0}, CL_FLOAT},  // gamma
                                                cl_arithmetic_mode};
        // Clear the handle so the check below validates this call rather than
        // the GEMM op created above.
        op = nullptr;
        // NOTE(review): the second operand is the "unused" tensor and the result
        // is written to `output`; presumably this accumulates the bias onto the
        // GEMM result via gamma -- confirm against the CLML binary-op spec.
        CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &binaryDesc, bias->tensor,
                  layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
        ICHECK(op) << "Binary Op Error";
        layer->function.push_back(op);
      }
    }
  }

  /*!
   * \brief Create only the tensors of a dense layer, using the supported layout (no op is built).
   *
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateDenseLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    // Only the input, weight and output tensors are requested here; no CLML
    // op is created and `layer` is not touched.
    DLDataType op_dtype = node.GetOpDataType()[0];
    cl_channel_type channel_type = MakeCLDataType(op_dtype);
    auto input_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);

    // A third input means a bias is present; a single-row input with bias
    // uses the optimal layout, everything else uses NCHW.
    bool with_bias = (node.GetInputs().size() == 3);
    cl_ml_tensor_layout_qcom tensor_layout = (input_dims.n == 1 && with_bias)
                                                 ? CL_TENSOR_LAYOUT_OPTIMAL_QCOM
                                                 : CL_TENSOR_LAYOUT_NCHW_QCOM;

    auto data_tensor =
        MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, tensor_layout, channel_type);
    auto weight_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
    auto weight_tensor = MakeCLMLTensorFromJSONEntry(
        node.GetInputs()[1].id_, {1, 1, weight_dims.n, weight_dims.c}, tensor_layout, channel_type);
    auto result_tensor = MakeCLMLTensorFromJSONEntry(nid, {}, tensor_layout, channel_type);
  }

  /*!
   * \brief Create a batch_matmul layer.
   *
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateBatchMatmulLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    cl_ml_op_qcom op = nullptr;
    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
    auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {in_dims.c, in_dims.h},
                                             CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
    auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.c, wt_dims.h},
                                              CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);

    // The CLML output shape is built from dimensions 1 and 2 of the op's output
    // shape, so reject lower-rank shapes instead of reading out of bounds.
    std::vector<int64_t> out_shape = node.GetOpShape()[0];
    ICHECK(out_shape.size() >= 3) << "BatchMatmul expects an output of rank >= 3, got rank "
                                  << out_shape.size();
    std::vector<size_t> clml_out_shape = {static_cast<size_t>(out_shape[1]),
                                          static_cast<size_t>(out_shape[2]), 1, 1};
    auto output =
        MakeCLMLTensorFromJSONEntry(nid, clml_out_shape, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
    // Record the overridden output shape for this node on the layer.
    layer->out_shapes.insert({nid, clml_out_shape});

    // "transpose_b" is read as an integer flag selecting the B-matrix transform.
    cl_bool b_transpose = std::stoi(node.GetAttr<std::vector<std::string>>("transpose_b")[0]);
    cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
    if (b_transpose) {
      b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
    }
    cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.c,                    // m
                                        wt_dims.c,                    // n
                                        wt_dims.h,                    // k
                                        CL_GEMM_TRANSFORM_NONE_QCOM,  // A transform
                                        b_transform,                  // B transform
                                        {{1.0}, CL_FLOAT},            // alpha
                                        {{0.0}, CL_FLOAT},            // beta
                                        cl_arithmetic_mode};

    CLML_CALL(clCreateMLOpGemmQCOM, CLML_CTX, nullptr, &gemmDesc, input->tensor, weight->tensor,
              output->tensor, &op, layer_.tuning_cache);
    ICHECK(op) << "BatchMatmul Error";

    layer->function.push_back(op);
  }

  /*!
   * \brief Create only the tensors of a batch_matmul layer (batch_size=1 supported), using the
   *        supported layout.
   *
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateBatchMatmulLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    // Requests the input, weight and output tensors and records the output
    // shape override; no CLML op is created here.
    DLDataType tvm_dtype = node.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
    auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {in_dims.c, in_dims.h},
                                             CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
    auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.c, wt_dims.h},
                                              CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);

    // Dimensions 1 and 2 of the op's output shape are used below, so reject
    // lower-rank shapes instead of reading out of bounds.
    std::vector<int64_t> out_shape = node.GetOpShape()[0];
    ICHECK(out_shape.size() >= 3) << "BatchMatmul expects an output of rank >= 3, got rank "
                                  << out_shape.size();
    std::vector<size_t> clml_out_shape = {static_cast<size_t>(out_shape[1]),
                                          static_cast<size_t>(out_shape[2]), 1, 1};
    auto output =
        MakeCLMLTensorFromJSONEntry(nid, clml_out_shape, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
    layer->out_shapes.insert({nid, clml_out_shape});
  }

  /*!
   * \brief Create a Clip(X) layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateClipLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    DLDataType op_dtype = node.GetOpDataType()[0];
    cl_channel_type channel_type = MakeCLDataType(op_dtype);
    cl_arithmetic_mode_qcom arith_mode = MakeCLArithMode(channel_type, channel_type);

    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);

    // Clip bounds are parsed from the first string of the "a_max" / "a_min"
    // attributes.
    cl_float upper_bound = std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
    cl_float lower_bound = std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);

    cl_ml_op_clip_desc_qcom clip_desc = {CL_CLIP_BY_VALUE_QCOM,
                                         {{upper_bound}, CL_FLOAT},
                                         {{lower_bound}, CL_FLOAT},
                                         arith_mode};

    cl_ml_op_qcom clip_op = nullptr;
    CLML_CALL_clCreateMLOpClipQCOM(CLML_CTX, nullptr, &clip_desc, src->tensor, dst->tensor,
                                   &clip_op, layer_.tuning_cache);
    ICHECK(clip_op) << "Clip Error";

    layer->function.push_back(clip_op);
  }

  /*!
   * \brief Create a Binary layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateBinaryLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    DLDataType op_dtype = node.GetOpDataType()[0];
    cl_channel_type channel_type = MakeCLDataType(op_dtype);
    cl_arithmetic_mode_qcom arith_mode = MakeCLArithMode(channel_type, channel_type);

    auto lhs = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);
    auto rhs = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);
    auto result = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);

    // Lookup table from operator name (exact name, or a pattern-matched
    // "relax.*" name) to the CLML binary opcode. Entries are tried in order.
    struct BinaryOpEntry {
      const char* exact_name;
      const char* relax_pattern;
      cl_binary_op_qcom opcode;
    };
    const BinaryOpEntry known_ops[] = {
        {"subtract", "relax.subtract", CL_TENSOR_OP_SUB_QCOM},
        {"multiply", "relax.multiply", CL_TENSOR_OP_MUL_QCOM},
        {"divide", "relax.divide", CL_TENSOR_OP_DIV_QCOM},
        {"minimum", "relax.minimum", CL_TENSOR_OP_MIN_QCOM},
        {"maximum", "relax.maximum", CL_TENSOR_OP_MAX_QCOM},
        {"add", "relax.add", CL_TENSOR_OP_ADD_QCOM},
    };

    std::string op_name = node.GetOpName();
    cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
    bool recognized = false;
    for (const BinaryOpEntry& candidate : known_ops) {
      if (op_name == candidate.exact_name || PatternMatch(op_name, candidate.relax_pattern)) {
        binary_op = candidate.opcode;
        recognized = true;
        break;
      }
    }
    if (!recognized) {
      LOG(FATAL) << "Undefined binary op:" << op_name;
    }

    cl_ml_op_binary_desc_qcom binary_desc = {
        binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, arith_mode};
    LOG(INFO) << "Op name - " << op_name;

    cl_ml_op_qcom binary_handle = nullptr;
    CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &binary_desc, lhs->tensor, rhs->tensor,
              result->tensor, &binary_handle, layer_.tuning_cache);
    ICHECK(binary_handle) << op_name << " Node Error";

    layer->function.push_back(binary_handle);
  }

  /*!
   * \brief Create a DepthToSpace(X) layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateDepthToSpaceLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    DLDataType op_dtype = node.GetOpDataType()[0];
    cl_channel_type channel_type = MakeCLDataType(op_dtype);
    cl_arithmetic_mode_qcom arith_mode = MakeCLArithMode(channel_type, channel_type);

    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);

    // The block size is parsed from the first string of the "block_size" attribute.
    cl_uint block = std::stoi(node.GetAttr<std::vector<std::string>>("block_size")[0]);
    cl_ml_op_depthtospace_desc_qcom d2s_desc = {block, arith_mode};

    cl_ml_op_qcom d2s_op = nullptr;
    CLML_CALL(clCreateMLOpDepthToSpaceQCOM, CLML_CTX, nullptr, &d2s_desc, src->tensor,
              dst->tensor, &d2s_op, layer_.tuning_cache);
    ICHECK(d2s_op) << "DepthToSpace Layer Error";

    layer->function.push_back(d2s_op);
  }

  /*!
   * \brief Create a Resize(X) layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param nid The node index of JSON graph node, which points to this operator.
   */
  void CreateResizeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
    DLDataType op_dtype = node.GetOpDataType()[0];
    cl_channel_type channel_type = MakeCLDataType(op_dtype);
    cl_arithmetic_mode_qcom arith_mode = MakeCLArithMode(channel_type, channel_type);

    auto src = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);
    auto dst = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, channel_type);

    // "align_corners" is read as an integer flag; the descriptor's second
    // field is always passed as false.
    cl_bool corners_aligned =
        std::stoi(node.GetAttr<std::vector<std::string>>("align_corners")[0]);
    cl_ml_op_resize_bilinear_desc_qcom bilinear_desc = {corners_aligned, false, arith_mode};

    cl_ml_op_qcom resize_op = nullptr;
    CLML_CALL(clCreateMLOpResizeBilinearQCOM, CLML_CTX, nullptr, &bilinear_desc, src->tensor,
              dst->tensor, &resize_op, layer_.tuning_cache);
    ICHECK(resize_op) << "Resize Layer Error";

    layer->function.push_back(resize_op);
  }

  /*!
   * \brief The network layers represented by CLML functions.
   * \note Currently only supports a single layer.
   */

  // This layer instance
  CachedLayer layer_;

  // CLML Workspace
  CLMLWorkspace* cws;

#else
  /*!
   * \brief Fallback Run() for the #else branch of the enclosing build
   *        conditional: always raises a fatal error telling the user to
   *        rebuild with USE_CLML_GRAPH_EXECUTOR.
   */
  void Run() override {
    LOG(FATAL) << "Cannot call run on CLML module without runtime enabled. "
               << "Please build with USE_CLML_GRAPH_EXECUTOR.";
  }

  /*!
   * \brief Fallback BuildEngine() for the #else branch of the enclosing build
   *        conditional: builds nothing and only logs a warning.
   */
  void BuildEngine() {
    LOG(WARNING) << "CLML engine is not initialized. "
                 << "Please build with USE_CLML_GRAPH_EXECUTOR.";
  }
#endif
  /*!
   * \brief Override of the base-class hook; always returns true.
   * \note Presumably advertises debug/profiling support to the base runtime --
   *       confirm against the base class declaration.
   */
  bool CanDebug() override { return true; }

  /*! CLML sub graph symbol in TVM main module */
  std::string clml_symbol;
};

ffi::Module CLMLRuntimeCreate(const ffi::String& symbol_name, const ffi::String& graph_json,
                              const ffi::Array<ffi::String>& const_names) {
  // Construct the CLMLRuntime object from the symbol name, graph JSON and
  // constant names, and return it wrapped in a module handle.
  return ffi::Module(ffi::make_object<CLMLRuntime>(symbol_name, graph_json, const_names));
}

TVM_FFI_STATIC_INIT_BLOCK() {
  namespace refl = tvm::ffi::reflection;
  refl::GlobalDef()
      .def("runtime.clml_runtime_create", CLMLRuntimeCreate)
      .def("ffi.Module.load_from_bytes.clml", JSONRuntimeBase::LoadFromBytes<CLMLRuntime>);
}
}  //  namespace contrib
}  //  namespace runtime
}  //  namespace tvm
